{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import all required libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''Import of libraries for visualization, data preservation and work in directory'''\n",
    "import os\n",
    "import re\n",
    "from stop_words import get_stop_words\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from nltk.tokenize import RegexpTokenizer as rt\n",
    "# Tokenizer that keeps runs of word characters, which drops punctuation.\n",
    "# The pattern is a raw string so that the backslash in \\w is not read as a string escape.\n",
    "deleter_punct = rt(r'\\w+')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/spacy/util.py:865: UserWarning: [W095] Model 'es_core_news_sm' (3.1.0) was trained with spaCy v3.1 and may not be 100% compatible with the current version (3.4.1). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate\n",
      "  warnings.warn(warn_msg)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rule\n"
     ]
    }
   ],
   "source": [
    "# Load the pretrained spaCy pipeline used for Spanish.\n",
    "import spacy\n",
    "\n",
    "nlp = spacy.load(\"es_core_news_sm\")\n",
    "\n",
    "# Report which mode the pipeline's lemmatizer component runs in.\n",
    "lemmatizer = nlp.get_pipe(\"lemmatizer\")\n",
    "print(lemmatizer.mode)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Open files, save them as a list of texts and a dictionary (num_of_issue: text)\n",
    "\n",
    "Explore them visually."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the cleaned Estafeta issues the working directory: the cells below call\n",
    "# os.listdir() without an argument and use '../...' paths relative to it.\n",
    "# NOTE: the path is relative, so this cell is meant to run once, from the directory\n",
    "# that contains 'Popular_Film_Issues'; re-running it from elsewhere is expected to fail.\n",
    "os.chdir('./Popular_Film_Issues/Estafeta_Cleaned')\n",
    "# Absolute path of the directory just entered, used to build file paths.\n",
    "path = os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_text_file(file_path, encoding='utf-8'):\n",
    "    '''Return the full content of a text file as a single string.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file_path : str or path-like\n",
    "        Location of the file to read.\n",
    "    encoding : str, optional\n",
    "        Encoding used to decode the file. Defaults to UTF-8 instead of the\n",
    "        platform default, so accented characters and the em dash are decoded\n",
    "        the same way on every machine.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The decoded text.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    OSError\n",
    "        If the file cannot be opened.\n",
    "    UnicodeDecodeError\n",
    "        If the file is not valid text in the given encoding.\n",
    "    '''\n",
    "    # The with-block closes the file, so no explicit close() is needed.\n",
    "    with open(file_path, 'r', encoding=encoding) as file:\n",
    "        return file.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every issue once and keep it both in a list (ordered by file name)\n",
    "# and in a dictionary keyed by the first three characters of the file name.\n",
    "list_estafeta_texts = []\n",
    "dict_estafeta_texts = {}\n",
    "# List the same directory the files are opened from, rather than whatever the\n",
    "# current working directory happens to be.\n",
    "for file_name in sorted(os.listdir(path)):\n",
    "    # Skip anything that is not a plain-text file.\n",
    "    if not file_name.endswith(\".txt\"):\n",
    "        continue\n",
    "    issue_text = read_text_file(os.path.join(path, file_name))\n",
    "    dict_estafeta_texts[file_name[:3]] = issue_text\n",
    "    list_estafeta_texts.append(issue_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34765"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Length of each Estafeta issue, counted in word tokens (punctuation removed).\n",
    "num_length = {\n",
    "    int(issue): len(deleter_punct.tokenize(issue_text))\n",
    "    for issue, issue_text in dict_estafeta_texts.items()\n",
    "}\n",
    "\n",
    "# Length of the whole corpus.\n",
    "sum(num_length.values())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stop-words - Tokenization - Lemmatization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the stop-word list and define the characters used to spot numeric tokens.\n",
    "\n",
    "# Despite the name, these are the ten digits.\n",
    "special_characters = set(\"1234567890\")\n",
    "\n",
    "# Each line of the file is treated as one stop word. Surrounding whitespace is\n",
    "# trimmed and blank lines are skipped, so entries compare equal to lemmas and\n",
    "# no empty string ends up in the set.\n",
    "stop_words_raw = read_text_file('../spanish_stop_words.txt')\n",
    "stop_words = {line.strip() for line in stop_words_raw.split('\\n') if line.strip()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the names and places that open the letters: everything from the start of\n",
    "# the text, or from a line break, up to the first em dash is replaced by a newline.\n",
    "heading_pattern = re.compile('\\\\n.+?[—]|^.+?[—]')\n",
    "\n",
    "for issue, issue_text in dict_estafeta_texts.items():\n",
    "    # Two passes: the second removes a further dash-terminated prefix that the\n",
    "    # first pass leaves at the start of a line.\n",
    "    first_pass = heading_pattern.sub('\\n', issue_text)\n",
    "    dict_estafeta_texts[issue] = heading_pattern.sub('\\n', first_pass)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize each issue (dropping punctuation), lemmatize word by word, and discard\n",
    "# stop words and tokens containing digits. Each issue becomes one string in which\n",
    "# every kept lemma is followed by a single space.\n",
    "\n",
    "dict_estafeta_text_lemmatized = {}\n",
    "# word form -> lemma. Each word is analysed on its own, so the lemma of a given\n",
    "# form is looked up once and reused instead of re-running the pipeline per occurrence.\n",
    "lemma_by_word = {}\n",
    "for key, value in dict_estafeta_texts.items():\n",
    "    kept_lemmas = []\n",
    "    for word in deleter_punct.tokenize(value):\n",
    "        # Tokens containing a digit are dropped, so they never need a lemma.\n",
    "        if any(c in special_characters for c in word):\n",
    "            continue\n",
    "        if word not in lemma_by_word:\n",
    "            # Only the first spaCy token of the word is used.\n",
    "            lemma_by_word[word] = nlp(word)[0].lemma_\n",
    "        lemma = lemma_by_word[word]\n",
    "        if lemma not in stop_words:\n",
    "            kept_lemmas.append(lemma)\n",
    "    # Joined in one pass; the trailing space after the last lemma is kept.\n",
    "    dict_estafeta_text_lemmatized[key] = ''.join(lemma + ' ' for lemma in kept_lemmas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each lemmatized issue to '<issue>_Estafeta.txt' in the output directory.\n",
    "output_dir = '../Estafeta_processed_python/'\n",
    "# Create the directory if it does not exist yet; chdir would fail on a missing one.\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "os.chdir(output_dir)\n",
    "path = os.getcwd()\n",
    "\n",
    "for key, value in dict_estafeta_text_lemmatized.items():\n",
    "    # Explicit UTF-8 so the output encoding does not depend on the platform default.\n",
    "    with open(os.path.join(path, '{}_Estafeta.txt'.format(key)), 'w', encoding='utf-8') as file:\n",
    "        file.write(value)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
